import torchaudio
import argparse
import json

def _read_lines(path):
    """Return all lines of the text file at *path*, closing the file afterwards."""
    with open(path, "r") as handle:
        return handle.readlines()


def main():
    """Print a JSON-lines manifest built from a fairseq tsv and a km file.

    Command-line arguments (parsed from ``sys.argv``):
        --tsv: tsv file whose first line is the wav root directory and whose
            remaining lines are ``<relative wav path>\\t<number of frames>``.
        --km: file with one line of codes per tsv entry, in the same order.
        --km_type: key under which the codes are stored in each output record.

    For every tsv entry one JSON object is printed to stdout with the keys
    ``"audio"`` (root + "/" + relative path), ``"duration"``
    (``num_frames / sample_rate`` as reported by ``torchaudio.info``) and the
    ``--km_type`` key holding the stripped km line.

    Raises:
        ValueError: if the km file does not have exactly one line per tsv
            entry (this includes an empty tsv file), or if the frame count
            listed in the tsv differs from the one reported for the wav file.
    """
    parser = argparse.ArgumentParser(description="example: python create_hifigan_manifest.py --tsv /checkpoint/felixkreuk/datasets/vctk/splits/vctk_16khz/train.tsv --km /checkpoint/felixkreuk/experiments/hubert/hubert_feats/vctk_16khz_km_100/train.km --km_type hubert_100km > ~/tmp/tmp_mani.txt")
    parser.add_argument("--tsv", required=True, help="path to fairseq tsv file")
    parser.add_argument("--km", required=True, help="path to a km file generated by HuBERT clustering")
    parser.add_argument("--km_type", required=True, help="name of the codes in the output json (for example: 'cpc_100km')")
    args = parser.parse_args()

    km_lines = _read_lines(args.km)
    tsv_lines = _read_lines(args.tsv)

    # Explicit raise instead of `assert`: asserts vanish under `python -O`,
    # which would let an empty tsv crash on the header lookup below and let
    # zip() silently truncate mismatched inputs.
    # The "- 1" accounts for the tsv header line holding the wav root.
    if len(km_lines) != len(tsv_lines) - 1:
        raise ValueError("tsv and km files are not of the same length!")

    wav_root = tsv_lines[0].strip()
    tsv_entries = tsv_lines[1:]

    for tsv_line, km_line in zip(tsv_entries, km_lines):
        tsv_line, km_line = tsv_line.strip(), km_line.strip()
        wav_basename, wav_num_frames = tsv_line.split("\t")
        wav_path = wav_root + "/" + wav_basename
        wav_info = torchaudio.info(wav_path)
        # Guard against a stale tsv: the listed frame count must match the file.
        if int(wav_num_frames) != wav_info.num_frames:
            raise ValueError("tsv duration and actual duration don't match!")
        wav_duration = wav_info.num_frames / wav_info.sample_rate
        manifest_line = {"audio": wav_path, "duration": wav_duration, args.km_type: km_line}
        print(json.dumps(manifest_line))

if __name__ == "__main__":
    # Example invocation (output is printed to stdout, redirected to a file here):
    #
    #   python create_hifigan_manifest.py \
    #           --tsv /checkpoint/felixkreuk/datasets/vctk/manifests/vctk_16khz/valid.tsv \
    #           --km /checkpoint/felixkreuk/datasets/vctk/manifests/vctk_16khz/hubert_km_100/valid.km \
    #           --km_type hubert \
    #           > /checkpoint/felixkreuk/datasets/vctk/manifests/vctk_16khz/hubert_km_100/hifigan_valid_manifest.txt
    main()
